# Attach the tidyverse, which provides read_csv() and the pipe used below.
library(tidyverse)

# Read the job-postings CSV; column types are guessed by read_csv().
job <- read_csv(file = "dataset/fake_job_postings.csv")
Parsed with column specification:
cols(
  job_id = col_double(),
  title = col_character(),
  location = col_character(),
  department = col_character(),
  salary_range = col_character(),
  company_profile = col_character(),
  description = col_character(),
  requirements = col_character(),
  benefits = col_character(),
  telecommuting = col_double(),
  has_company_logo = col_double(),
  has_questions = col_double(),
  employment_type = col_character(),
  required_experience = col_character(),
  required_education = col_character(),
  industry = col_character(),
  `function` = col_character(),
  fraudulent = col_double()
)
# Display the tibble itself (auto-printed because it is a top-level expression).
job
# Show one line per column with its type and the first few values.
glimpse(job)
Rows: 17,880
Columns: 18
$ job_id              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,...
$ title               <chr> "Marketing Intern", "Customer Service - Cloud Video Production", "Commissioning Machinery Assistant (CMA)", "Account Executive - Washington DC", "Bill Review Manager", "Accounting Clerk", "Head of Content (...
$ location            <chr> "US, NY, New York", "NZ, , Auckland", "US, IA, Wever", "US, DC, Washington", "US, FL, Fort Worth", "US, MD,", "DE, BE, Berlin", "US, CA, San Francisco", "US, FL, Pensacola", "US, AZ, Phoenix", "US, NJ, Jers...
$ department          <chr> "Marketing", "Success", NA, "Sales", NA, NA, "ANDROIDPIT", NA, NA, NA, NA, "HR", NA, NA, "Sales", "Sales", "R&D", NA, NA, NA, NA, NA, "Engagement", "Businessfriend.com", NA, NA, "Marketing", "Medical", NA, ...
$ salary_range        <chr> NA, NA, NA, NA, NA, NA, "20000-28000", NA, NA, NA, "100000-120000", NA, NA, NA, NA, "120000-150000", NA, NA, NA, NA, NA, NA, NA, "100000-120000", NA, NA, NA, NA, NA, NA, NA, "50000-65000", NA, NA, NA, NA, N...
$ company_profile     <chr> "We're Food52, and we've created a groundbreaking and award-winning cooking site. We support, connect, and celebrate home cooks, and give them everything they need in one place.We have a top editorial, busi...
$ description         <chr> "Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of edi...
$ requirements        <chr> "Experience with content management systems a major plus (any blogging counts!)Familiar with the Food52 editorial voice and aestheticLoves food, appreciates the importance of home cooking and cooking with t...
$ benefits            <chr> NA, "What you will get from usThrough being part of the 90 Seconds team you will gain:experience working on projects located around the world with an international brandexperience working with a variety of ...
$ telecommuting       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ has_company_logo    <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,...
$ has_questions       <dbl> 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,...
$ employment_type     <chr> "Other", "Full-time", NA, "Full-time", "Full-time", NA, "Full-time", NA, "Full-time", "Part-time", "Full-time", NA, "Full-time", "Full-time", "Full-time", "Full-time", "Full-time", NA, NA, "Full-time", NA, ...
$ required_experience <chr> "Internship", "Not Applicable", NA, "Mid-Senior level", "Mid-Senior level", NA, "Mid-Senior level", NA, "Associate", "Entry level", "Mid-Senior level", NA, "Associate", "Not Applicable", "Associate", "Execu...
$ required_education  <chr> NA, NA, NA, "Bachelor's Degree", "Bachelor's Degree", NA, "Master's Degree", NA, NA, "High School or equivalent", "Bachelor's Degree", NA, "Bachelor's Degree", "Unspecified", "Bachelor's Degree", "Bachelor'...
$ industry            <chr> NA, "Marketing and Advertising", NA, "Computer Software", "Hospital & Health Care", NA, "Online Media", NA, "Information Technology and Services", "Financial Services", "Information Technology and Services"...
$ `function`          <chr> "Marketing", "Customer Service", NA, "Sales", "Health Care Provider", NA, "Management", NA, NA, "Customer Service", "Information Technology", NA, "Information Technology", "Other", "Sales", "Sales", "Engine...
$ fraudulent          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns.
job %>%
  summary()
     job_id         title             location          department        salary_range       company_profile    description        requirements         benefits         telecommuting    has_company_logo has_questions    employment_type   
 Min.   :    1   Length:17880       Length:17880       Length:17880       Length:17880       Length:17880       Length:17880       Length:17880       Length:17880       Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Length:17880      
 1st Qu.: 4471   Class :character   Class :character   Class :character   Class :character   Class :character   Class :character   Class :character   Class :character   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   Class :character  
 Median : 8940   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Median :0.0000   Median :1.0000   Median :0.0000   Mode  :character  
 Mean   : 8940                                                                                                                                                           Mean   :0.0429   Mean   :0.7953   Mean   :0.4917                     
 3rd Qu.:13410                                                                                                                                                           3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000                     
 Max.   :17880                                                                                                                                                           Max.   :1.0000   Max.   :1.0000   Max.   :1.0000                     
 required_experience required_education   industry           function           fraudulent     
 Length:17880        Length:17880       Length:17880       Length:17880       Min.   :0.00000  
 Class :character    Class :character   Class :character   Class :character   1st Qu.:0.00000  
 Mode  :character    Mode  :character   Mode  :character   Mode  :character   Median :0.00000  
                                                                              Mean   :0.04843  
                                                                              3rd Qu.:0.00000  
                                                                              Max.   :1.00000  

There are 17,880 records, of which 4.843% are fraudulent job postings. We’re dealing with a class-imbalanced problem.

Data Cleaning

There is a column named ‘function’ in the dataset, which conflicts with R’s reserved keyword `function`. This can be annoying later, so we’re going to rename it. Also, the job_id variable is just a row identifier and won’t help with prediction, so we’re dropping it now.

Once that is done, we’ll go ahead and look for missing values.

# visdat supplies vis_miss() for the missingness plot below.
library(visdat)

# Drop the job_id column and rename `function` (a reserved word in R, hence
# the backticks) to `func`.
job <- job %>%
  select(-job_id) %>%
  rename(func = `function`)

# Plot which cells are missing in each column.
vis_miss(job)

It’s clear that salary_range is the variable with the most missing values. Let’s find out why this occurred.

LS0tDQp0aXRsZTogIiBGYWtlIEpvYiBQb3N0aW5nIFByZWRpY3Rpb24iDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KYGBgDQoNCmBgYHtyfQ0Kam9iIDwtIHJlYWRfY3N2KCdkYXRhc2V0L2Zha2Vfam9iX3Bvc3RpbmdzLmNzdicpDQpqb2INCmBgYA0KDQpgYGB7cn0NCmdsaW1wc2Uoam9iKQ0KYGBgDQoNCmBgYHtyfQ0Kc3VtbWFyeShqb2IpDQpgYGANCg0KVGhlcmUgYXJlIDE3LDg4MCByZWNvcmRzLiA0Ljg0MyUgam9icyBwb3N0ZWQgYXJlIGZyYXVkdWxlbnQuIFdlJ3JlIGRlYWxpbmcgd2l0aCBhIGNsYXNzLWltYmFsYW5jZWQgcHJvYmxlbS4NCg0KIyMgRGF0YSBDbGVhbmluZw0KDQpUaGVyZSBpcyBhIGNvbHVtbiBuYW1lZCAnZnVuY3Rpb24nIGluIHRoZSBkYXRhc2V0IHdoaWNoIGNvbmZsaWN0cyBSJ3MgYmFzZSBmdW5jdGlvbi4gVGhpcyBjYW4gYmUgYW5ub3lpbmcgbGF0ZXIsIHNvIHdlJ3JlIGdvaW5nIHRvIGZpeCB0aGlzLg0KQWxzbywgdGhlIGpvYl9pZCB2YXJpYWJsZSB3b24ndCBkbyBhIGxvdCBvZiBnb29kIGVpdGhlciwgc28gd2UncmUgZHJvcHBpbmcgdGhpcyB2YXJpYWJsZSBub3cuDQoNCk9uY2UgZG9uZSwgd2UnbGwgZ28gYWhlYWQgdG8gbG9vayBmb3IgbWlzc2luZyB2YWx1ZXMuDQoNCmBgYHtyfQ0Kam9iIDwtIGpvYiAlPiUgDQogIHJlbmFtZShmdW5jID0gYGZ1bmN0aW9uYCkgJT4lIA0KICBzZWxlY3QoLWpvYl9pZCkNCg0KbGlicmFyeSh2aXNkYXQpDQoNCnZpc19taXNzKGpvYikNCmBgYA0KDQpJdCdzIGNsZWFyIHRvIHNlZSB0aGUgc2FsYXJ5X3JhbmdlIGlzIHRoZSB2YXJpYWJsZSB3aGljaCBoYXMgdGhlIG1vc3QgbWlzc2luZyB2YWx1ZXMuIExldCdzIGZpbmQgb3V0IHdoeSB0aGlzIG9jY3VyZWQuDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg==